Skip to main content

General Idea of writing code for Classification

Import the Necessary Libraries

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

Load the Data into a DataFrame

# Load the data from CSV file
# (alternate dataset path kept for reference; both are CIC-IDS2018 daily capture CSVs)
# df = pd.read_csv(r"D:\Stuff\CyberSec\archive\03-02-2018.csv")
# NOTE(review): relative Windows-style path — assumes the script is run from a
# sibling directory of "Datasets"; confirm the working directory before running.
df = pd.read_csv(r"..\Datasets\IDS2018\02-14-2018.csv")

# Remove any rows with missing values
# (left disabled — enable if NaNs break the classifiers below)
# df = df.dropna()

# Drop columns where all values are 0
# (left disabled — optional dimensionality cleanup)
# df = df.loc[:, (df != 0).any(axis=0)]

# to select first n rows only
# (left disabled — handy for quick experiments on a sample)
# df = df.iloc[:n,:]

To print the features available in our model

# List every column of the loaded dataset, numbered from 1, so the 1-based
# positions used in the feature-selection lists below are easy to look up.
# Fix: the print statement was not indented under the for loop, which is an
# IndentationError as originally written.
for i, col_name in enumerate(df.columns):
    print(f"Feature {i+1}:\t\"{col_name}\"")

Accuracy for Decision Tree

# 1-based positions of the feature columns chosen for this experiment.
feature_positions = [1, 2, 4, 5, 6, 11, 15, 19, 29, 33, 34, 35, 40, 46, 48, 58, 59, 62, 66]

# iloc indexing is 0-based, so shift every position down by one.
selected_cols_idx = [pos - 1 for pos in feature_positions]

# Feature matrix from the chosen columns; target vector from the last column.
X = df.iloc[:, selected_cols_idx].values
y = df.iloc[:, -1].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a decision tree on the training portion.
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)

# Mean accuracy on the held-out rows.
accuracy = tree_model.score(X_test, y_test)

# Human-readable names of the selected feature columns.
selected_cols = list(df.columns[selected_cols_idx])

print("Accuracy for the following features combined", selected_cols, "is:", accuracy)

Accuracy for Random Forest

# Selected features, written as 1-based positions and shifted to 0-based for iloc.
selected_cols_idx = [n - 1 for n in [1, 2, 4, 5, 6, 11, 15, 19, 29, 33, 34, 35, 40, 46, 48, 58, 59, 62, 66]]

# Split predictors (chosen columns) from the label (last column).
X = df.iloc[:, selected_cols_idx].values
y = df.iloc[:, -1].values

# Reproducible 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Random forest of 100 trees, seeded for repeatable results.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest.fit(X_train, y_train)

# Evaluate on the held-out 20%.
accuracy = forest.score(X_test, y_test)

# Column names corresponding to the selected positions.
selected_cols = list(df.columns[selected_cols_idx])

print("Accuracy for the following features combined", selected_cols, "is:", accuracy)

Accuracy for KNN

# Feature columns for this run, given as 1-based positions.
one_based = [1, 2, 4, 5, 6, 11, 15, 19, 29, 33, 34, 35, 40, 46, 48, 58, 59, 62, 66]
# Convert to the 0-based positions that iloc expects.
selected_cols_idx = [p - 1 for p in one_based]

# Inputs are the chosen columns; the label lives in the final column.
X = df.iloc[:, selected_cols_idx].values
y = df.iloc[:, -1].values

# Deterministic 80/20 split for training vs. evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# k-nearest-neighbours classifier with default settings (k=5, uniform weights).
neigh = KNeighborsClassifier()
neigh.fit(X_train, y_train)

# Accuracy over the test split.
accuracy = neigh.score(X_test, y_test)

# Resolve the selected positions back to column names for the report line.
selected_cols = list(df.columns[selected_cols_idx])

print("Accuracy for the following features combined", selected_cols, "is:", accuracy)

Accuracy for Weighted KNN

# Same feature set as the other experiments (1-based), shifted for iloc.
positions = [1, 2, 4, 5, 6, 11, 15, 19, 29, 33, 34, 35, 40, 46, 48, 58, 59, 62, 66]
selected_cols_idx = [idx - 1 for idx in positions]

# Build the design matrix and the target array.
X = df.iloc[:, selected_cols_idx].values
y = df.iloc[:, -1].values

# 80/20 split with a pinned seed so runs are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Distance-weighted KNN: closer neighbours get a larger vote.
knn = KNeighborsClassifier(weights='distance')
knn.fit(X_train, y_train)

# Score the fitted model on the test split.
accuracy = knn.score(X_test, y_test)

# Names of the columns actually used as features.
selected_cols = list(df.columns[selected_cols_idx])

print("Accuracy for the following features combined", selected_cols, "is:", accuracy)

Accuracy for Gaussian Naive Bayes

# Chosen feature columns (1-based notation), converted to 0-based for iloc.
selected_cols_idx = [
    c - 1
    for c in [1, 2, 4, 5, 6, 11, 15, 19, 29, 33, 34, 35, 40, 46, 48, 58, 59, 62, 66]
]

# Features come from the selected columns; the class label is the last column.
X = df.iloc[:, selected_cols_idx].values
y = df.iloc[:, -1].values

# Reproducible split: 80% train, 20% test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Gaussian Naive Bayes with default priors and smoothing.
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Mean accuracy on unseen rows.
accuracy = nb_model.score(X_test, y_test)

# Report which columns were used, by name.
selected_cols = list(df.columns[selected_cols_idx])

print("Accuracy for the following features combined", selected_cols, "is:", accuracy)

Accuracy for MLP

# The shared feature set, expressed as 1-based column positions.
raw_positions = [1, 2, 4, 5, 6, 11, 15, 19, 29, 33, 34, 35, 40, 46, 48, 58, 59, 62, 66]
# Shift to 0-based indexes for positional selection.
selected_cols_idx = [p - 1 for p in raw_positions]

# Predictors and target (last column) as NumPy arrays.
X = df.iloc[:, selected_cols_idx].values
y = df.iloc[:, -1].values

# Seeded 80/20 split so every model sees the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Two hidden layers (100 and 50 units), Adam optimiser, capped at 300 epochs.
mlp = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    max_iter=300,
    alpha=0.0001,
    solver='adam',
    random_state=42,
    tol=0.0001,
)
mlp.fit(X_train, y_train)

# Accuracy on the held-out test rows.
accuracy = mlp.score(X_test, y_test)

# Translate the index list back into column names for the summary.
selected_cols = list(df.columns[selected_cols_idx])

print("Accuracy for the following features combined", selected_cols, "is:", accuracy)

Accuracy for QDA

# StandardScaler is the only name this section needs that is not already
# imported at the top of the file. (Fix: the original also re-imported
# train_test_split and QuadraticDiscriminantAnalysis here, duplicating the
# file-level imports — the redundant imports are removed.)
from sklearn.preprocessing import StandardScaler

# Same shared feature set: 1-based positions, shifted to 0-based for iloc.
selected_cols_idx = [1, 2, 4, 5, 6, 11, 15, 19, 29, 33, 34, 35, 40, 46, 48, 58, 59, 62, 66]
selected_cols_idx = [x - 1 for x in selected_cols_idx]

# Feature matrix from the chosen columns; label from the last column.
X = df.iloc[:, selected_cols_idx].values
y = df.iloc[:, -1].values

# Reproducible 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features: fit the scaler on the training rows only, then apply
# the same transform to the test rows (avoids leaking test statistics).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# QDA with a small regularization term to stabilize per-class covariances.
qda = QuadraticDiscriminantAnalysis(reg_param=0.1)
qda.fit(X_train, y_train)

# Accuracy on the scaled test split.
accuracy = qda.score(X_test, y_test)

# Names of the columns used as features, for the report line.
selected_cols = list(df.columns[selected_cols_idx])

print("Accuracy for the following features combined", selected_cols, "is:", accuracy)

Get the Notebook from here

CyberSec-NGIT/ml_imp_features.ipynb at main · stealthspectre/CyberSec-NGIT (github.com)